import pandas as pd
import numpy as np
from langdetect import detect as langdetect
import random
seed = 42
np.random.seed(seed)
random.seed(seed)
import sys
sys.path.append("..")
from data.labeled.raw import CWA as cwa, CWR as cwr
from data.labeled.preprocessed import LABELED_CITIES_PATH, RISKS_MAPPING_PATH, SEVERITY_MAPPING_PATH, IMPUTATION_REPORT_PATH
cwa.head()
cwr.head()
# Normalize column names: lowercase, spaces to underscores, drop apostrophes
cwa.columns = cwa.columns.str.lower().str.replace(" ", "_").str.replace("'","")
cwr.columns = cwr.columns.str.lower().str.replace(" ", "_").str.replace("'","")
cwr.isnull().sum()
cwa.isnull().sum()
id_columns = list(set(cwr.columns) & set(cwa.columns))
id_columns
cwr[id_columns].isnull().sum()
cwa[id_columns].isnull().sum()
cwr['c40'] = cwr['c40'] == 'C40'
cwr[id_columns] = cwr[id_columns].fillna('nan')
cwa['c40'] = cwa['c40'] == 'C40'
cwa[id_columns] = cwa[id_columns].fillna('nan')
len(cwa[id_columns]), len(cwa[id_columns].drop_duplicates())
len(cwr[id_columns]), len(cwr[id_columns].drop_duplicates())
cwa_agg = cwa.groupby(id_columns).agg(set).reset_index()
cwr_agg = cwr.groupby(id_columns).agg(set).reset_index()
len(cwa_agg[id_columns]), len(cwa_agg[id_columns].drop_duplicates())
len(cwr_agg[id_columns]), len(cwr_agg[id_columns].drop_duplicates())
# When several magnitudes are reported for the same id, keep the most severe one
cwr_agg.magnitude = cwr_agg.magnitude.apply(lambda x: 'Extremely serious' if 'Extremely serious' in x else ('Serious' if 'Serious' in x else list(x)[0]))
cwr_agg.magnitude.unique()
Fill the non-merged rows with {np.nan} in the list columns so that every list column consistently holds a set
cw_data = pd.merge(cwa_agg, cwr_agg, on=id_columns, how='outer')
list_columns = cw_data.iloc[0].apply(lambda x: isinstance(x, set))
list_columns = list_columns[list_columns].index
cw_data[list_columns] = cw_data[list_columns].applymap(lambda x: {np.nan} if pd.isna(x) else x)
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data.head()
cw_data['city_location'] = cw_data['city_location'].apply(eval)  # stringified "(latitude, longitude)" tuples
cw_data['latitude'] = cw_data['city_location'].apply(lambda x: x[0])
cw_data['longitude'] = cw_data['city_location'].apply(lambda x: x[1])
id_columns = [x for x in id_columns] + ['latitude','longitude']
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_value_columns = [ x for x in cw_data.columns if x not in id_columns]
# Values that can be converted back to strings from sets
singular_cols = cw_data.set_index(id_columns)[list_columns].applymap(lambda x: len(x)==1).product() > 0
singular_cols
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
# magnitude is nicely singular per id
cw_data.loc[:, singular_cols[singular_cols].index] = cw_data.loc[:, singular_cols[singular_cols].index].applymap(lambda x: list(x)[0])
list_columns = [x for x in list_columns if x not in singular_cols[singular_cols].index]
Also convert the sets to lists, keeping only entries that are non-null, non-empty, not error markers, and detected as English
def try_lang_detect_except(x):
    # True only if langdetect identifies the text as English; langdetect raises on
    # empty or undetectable strings, so treat those as non-English
    try:
        return langdetect(x) == 'en'
    except Exception:
        return False
cw_data = cw_data.applymap(lambda x: [t for t in x if not pd.isnull(t) and (t!='') and (t!='ERROR: #NAME?') and try_lang_detect_except(t)] if isinstance(x,set) else x)
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data.risks_to_citys_water_supply.unique()
cw_data.risks_to_citys_water_supply = cw_data.risks_to_citys_water_supply.apply(lambda x: x.replace('Inadequate or ageing infrastructure','Inadequate or aging infrastructure'))
cw_data['risks_to_citys_water_supply'] = cw_data['risks_to_citys_water_supply'].apply(lambda x: x.replace('Declining water quality: Declining water quality', 'Declining water quality'))
cw_data.risks_to_citys_water_supply.unique()
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data.drop_duplicates(subset=id_columns,inplace=True)
cw_data.head()
to_drop = ['timescale','current_population','population_year']
cw_data['population_year'] = cw_data['population_year'].fillna(0)
t = cw_data.sort_values('population_year' ).groupby([x for x in cw_data.columns if (x !='population_year') and x not in list_columns],dropna=False).last().reset_index()
cw_data
cw_data.drop(columns=['timescale','population_year'],inplace=True)
cw_data.rename(columns={'current_population':'population'},inplace=True)
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Create value_columns containing the names of all the value columns to keep, plus the per-list length columns created below
value_columns = set([x for x in cw_value_columns if x not in to_drop] + ['risks_to_citys_water_supply'])
list_columns = cw_data.iloc[0].apply(lambda x: isinstance(x, list))
list_columns = list_columns[list_columns].index
len_columns = set()
for x in value_columns.copy():
    if x in list_columns:
        cw_data[x + '_n'] = cw_data[x].apply(lambda v: len(v) if v else 1)
        len_columns.add(x + '_n')
value_columns = value_columns.union(len_columns)
value_columns
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
We assume that cities participating in the C40 program also report better-structured data, so this subset is used as the reference for the imputation
c40_data = cw_data[cw_data['c40']].copy()
c40_data.shape
list_nulls = c40_data[list_columns].applymap(lambda x: len(x) == 0)
list_nulls.mean()
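To put these numbers in context, here is a quick sketch (hypothetical helper code, using the columns defined above) comparing the share of empty list entries between the C40 and non-C40 subsets, as a rough check of the data-quality assumption
# Sketch: compare emptiness of the list columns between C40 and non-C40 cities
for name, subset in [('C40', cw_data[cw_data['c40']]), ('non-C40', cw_data[~cw_data['c40']])]:
    empty_share = subset[list_columns].applymap(lambda v: len(v) == 0).mean().mean()
    print(f"{name}: average share of empty list entries = {empty_share:.2%}")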
c40_data['magnitude'].unique()
c40_risks = c40_data[id_columns+['magnitude']].copy()
c40_risks['low'] = c40_risks['magnitude'] == 'Less Serious'
c40_risks['medium'] = c40_risks['magnitude'] == 'Serious'
c40_risks['high'] = c40_risks['magnitude'] == 'Extremely serious'
c40_risks['unknown'] = c40_risks['magnitude'].isna()
c40_risks = c40_risks.drop(columns=['magnitude']).groupby([x for x in id_columns if x!="risks_to_citys_water_supply"]).agg(sum).reset_index()
c40_risks
from sklearn.preprocessing import LabelEncoder
from utils.nlp import SimilarityAnalysis, create_sim_vector
c40_data.risks_to_citys_water_supply = c40_data.risks_to_citys_water_supply.apply(lambda x: np.nan if x=='nan' else x)
c40_data[c40_data.risks_to_citys_water_supply.isnull()]
c40_data.risks_to_citys_water_supply.unique()
Fit a SimilarityAnalysis per string-list column of the value columns by concatenating their strings together, then vectorize each string-list column based on its pairwise similarities
analyses = {x:SimilarityAnalysis() for x in value_columns if x in list_columns}
vectorized = np.hstack([analyses[x].fit_transform(c40_data[x]) for x in analyses])
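For intuition, a minimal sketch of what such a similarity-based vectorizer could look like, built on TF-IDF and cosine similarity (hypothetical; the actual utils.nlp SimilarityAnalysis may work differently)
# Hypothetical sketch, not the utils.nlp implementation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class SimpleSimilarityAnalysis:
    """Vectorize string lists by their cosine similarity to the texts seen at fit time."""
    def fit_transform(self, column):
        # Concatenate each row's list of strings into a single document
        self.docs_ = [' '.join(items) if items else '' for items in column]
        self.vectorizer_ = TfidfVectorizer()
        self.ref_matrix_ = self.vectorizer_.fit_transform(self.docs_)
        return cosine_similarity(self.ref_matrix_, self.ref_matrix_)

    def transform(self, column):
        docs = [' '.join(items) if items else '' for items in column]
        return cosine_similarity(self.vectorizer_.transform(docs), self.ref_matrix_)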
cols_to_encode = [x for x in value_columns if x not in list_columns and x not in len_columns]
c40_data[cols_to_encode] = c40_data[cols_to_encode].fillna('nan')
encoders = {x:LabelEncoder() for x in cols_to_encode}
encoded = np.array([encoders[x].fit_transform(c40_data[x]) for x in cols_to_encode]).T.astype(float)
# Mark the encoded 'nan' category as missing so the imputer treats it as such
for t in range(len(cols_to_encode)):
    encoded[encoded[:, t] == np.where(encoders[cols_to_encode[t]].classes_ == 'nan')[0], t] = np.nan
lab_freqs = c40_data[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
lab_freqs
whole = np.hstack([encoded, vectorized, c40_data[len_columns]])
import sys
sys.path.append('..')
from labeled_preprocessing.imputation import LabeledDatasetImputer
(LabeledDatasetImputer combines two IterativeImputers, one for the labels and one for the features)
imputer = LabeledDatasetImputer(verbose=2,k_features_per_label=0, seed=seed)
continuous_imputed, y = imputer.fit_transform(whole[:,2:], whole[:,:2], ret_imputed_x=True)
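For reference, a minimal sketch of how such a two-part imputer could be structured around scikit-learn's IterativeImputer (hypothetical; the real labeled_preprocessing.imputation class differs, e.g. it selects features per label via k_features_per_label and exposes a selection_mask)
# Hypothetical sketch, not the actual LabeledDatasetImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

class TwoPartImputer:
    """Couples two iterative imputers: one for the features, one for the labels."""
    def __init__(self, seed=42):
        self.x_imputer = IterativeImputer(random_state=seed)
        self.y_imputer = IterativeImputer(random_state=seed)

    def fit_transform(self, X, y, ret_imputed_x=False):
        X_imp = self.x_imputer.fit_transform(X)
        # Impute the label columns jointly with the already-imputed features
        y_imp = self.y_imputer.fit_transform(np.hstack([X_imp, y]))[:, X_imp.shape[1]:]
        return (X_imp, y_imp) if ret_imputed_x else y_imp

    def transform(self, X, y, ret_imputed_x=False):
        X_imp = self.x_imputer.transform(X)
        y_imp = self.y_imputer.transform(np.hstack([X_imp, y]))[:, X_imp.shape[1]:]
        return (X_imp, y_imp) if ret_imputed_x else y_imp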
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
simple_model = make_pipeline(StandardScaler(), SVC(kernel='linear', C=0.01, random_state=seed))
scoring = 'accuracy'
scores = cross_val_score(simple_model, continuous_imputed, y[:,cols_to_encode.index('magnitude')], cv=5,scoring=scoring)
np.mean(scores)
import matplotlib.pyplot as plt
# Row mask: complete rows only; column mask: the two label columns plus the features kept by the imputer
flg = np.all(~np.isnan(whole), axis=1), np.hstack([[True, True], imputer.selection_mask[:]])
corr = np.corrcoef(whole[flg[0],:][:,flg[1]].T)
print(corr.shape)
plt.matshow(corr)
# Decode the imputed label codes back to their original string categories
imputed = np.array([
    encoders[x].inverse_transform(y[:, c].astype(int))
    for c, x in enumerate(cols_to_encode)
]).T
c40_data_imputed = c40_data.copy()
c40_data_imputed[cols_to_encode] = imputed
c40_data_imputed.head()
imp_lab_freqs = c40_data_imputed[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_ret = pd.merge(lab_freqs, imp_lab_freqs,suffixes=('','_imp'), how='right',on=cols_to_encode)
imp_ret['increase'] = (imp_ret['size_imp'] - imp_ret['size'])/imp_ret['size']
imp_ret
set([x for x in cw_data['risks_to_citys_water_supply'] if x not in encoders['risks_to_citys_water_supply'].classes_])
cw_data['risks_to_citys_water_supply'].value_counts()
Encode and vectorize the full dataset, using the encoders and SimilarityAnalysis objects fitted on the C40 subset
cw_data[cols_to_encode] = cw_data[cols_to_encode].fillna('nan')
encoded = np.array([encoders[x].transform(cw_data[x]) for x in cols_to_encode]).T.astype(float)
for t in range(len(cols_to_encode)):
    encoded[encoded[:, t] == np.where(encoders[cols_to_encode[t]].classes_ == 'nan')[0], t] = np.nan
all_vectorized = np.hstack([analyses[x].transform(cw_data[x]) for x in analyses])  # same column order as at fit time
Impute the full dataset using the imputer trained on the C40 subset
all_imputed_x, all_imputed_y = imputer.transform(np.hstack([all_vectorized,cw_data[len_columns]]), encoded, ret_imputed_x=True)
all_imputed_y_dec = np.array([
    encoders[x].inverse_transform(all_imputed_y[:, c].astype(int))
    for c, x in enumerate(cols_to_encode)
]).T
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data_imputed = cw_data.copy()
cw_data_imputed[cols_to_encode] = all_imputed_y_dec
cw_data_imputed.drop_duplicates(id_columns,inplace=True)
import matplotlib.pyplot as plt
corr = np.corrcoef(np.hstack([all_imputed_y, all_imputed_x]).T)
print(corr.shape)
plt.matshow(corr)
scores = cross_val_score(simple_model, all_imputed_x, all_imputed_y[:,cols_to_encode.index('magnitude')], cv=5,scoring=scoring)
np.mean(scores)
The score did not drop significantly, so we can assume the imputation was consistent across the full dataset
final_labeled_data = cw_data_imputed[['city','latitude','longitude','country','population','c40','magnitude','risks_to_citys_water_supply']].copy()
final_labeled_data.rename(columns={'magnitude': 'risk','risks_to_citys_water_supply': 'description'},inplace=True)
final_labeled_data[['city','latitude','longitude','c40', 'population','description','risk']]
cw_data_imputed[id_columns].drop_duplicates().shape
risks = final_labeled_data.description.unique()
risks_description = {risk: f'risk{c}' for c,risk in enumerate(risks)}
risks_df = pd.Series(risks_description).to_frame()
risks_df.reset_index(inplace=True)
risks_df.columns=['description','code']
final_labeled_data['description'] = final_labeled_data['description'].apply(lambda x: risks_description[x])
risks_df
main_index=['city','latitude','longitude', 'country']
Check whether the c40 flag is consistent per city, and resolve the rows where it is not
c40_check = final_labeled_data.groupby(main_index)['c40'].nunique()
c40_check[c40_check>1]
final_labeled_data.city.nunique()
final_labeled_data[final_labeled_data.city == 'Santiago']
final_labeled_data = final_labeled_data.sort_values(by='c40').groupby(main_index + ['description']).last().reset_index()
final_labeled_data = final_labeled_data.drop(columns=['c40']).merge(final_labeled_data.groupby(main_index)['c40'].max().reset_index(),on=main_index)
final_labeled_data[final_labeled_data.city=='Santiago']
final_labeled_data = final_labeled_data.sort_values(by='c40').groupby(main_index + ['description']).last().reset_index()
pop_df = final_labeled_data.groupby(main_index)[['population']].max().reset_index().drop_duplicates()
final_labeled_data = final_labeled_data.drop(columns=['population']).merge(pop_df, on=main_index)
final_labeled_data = final_labeled_data.pivot(index=main_index + ['c40','population'], columns='description', values='risk').reset_index()
severity_mapping = {'Less Serious':1, "Serious":2, 'Extremely serious':3}
pd.DataFrame([severity_mapping]).to_csv(SEVERITY_MAPPING_PATH,index=False)
nrisks = list(risks_description.values())
final_labeled_data[nrisks] = final_labeled_data[nrisks].replace(severity_mapping)
final_labeled_data.loc[final_labeled_data['c40'], nrisks] = final_labeled_data.loc[final_labeled_data['c40'], nrisks].fillna(0)
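To make the mapping concrete, a tiny hypothetical example (risk0/risk1 are illustrative column names) of how the severity replacement and the C40 fill behave
# Hypothetical illustration of the two steps above
demo = pd.DataFrame({'c40': [True, False],
                     'risk0': ['Serious', np.nan],
                     'risk1': [np.nan, 'Extremely serious']})
demo[['risk0', 'risk1']] = demo[['risk0', 'risk1']].replace(severity_mapping)
demo.loc[demo['c40'], ['risk0', 'risk1']] = demo.loc[demo['c40'], ['risk0', 'risk1']].fillna(0)
# C40 row: risk0 -> 2, risk1 -> 0 (missing treated as absent); non-C40 row: risk0 stays NaN, risk1 -> 3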
final_labeled_data.to_csv(LABELED_CITIES_PATH,index=False)
risks_df.to_csv(RISKS_MAPPING_PATH,index=False)
final_labeled_data.head()
cw_lab_freqs = cw_data[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_cw_lab_freqs = cw_data_imputed[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_cw_ret = pd.merge(cw_lab_freqs, imp_cw_lab_freqs,suffixes=('','_imp'), how='right',on=cols_to_encode)
imp_cw_ret['increase'] = (imp_cw_ret['size_imp'] - imp_cw_ret['size'])/imp_cw_ret['size']
imp_cw_ret
imp_report = pd.merge(imp_cw_ret, imp_ret, suffixes=('','_c40'), on=('magnitude','risks_to_citys_water_supply'))
imp_report.to_csv(IMPUTATION_REPORT_PATH,index=False)
imp_report